Tidyverse

Author

Agam Safaruddin

1 Requirements

1.1 Libraries

with eval: true

library(readxl)
library(lubridate)
#> 
#> Attache Paket: 'lubridate'
#> Die folgenden Objekte sind maskiert von 'package:base':
#> 
#>     date, intersect, setdiff, union
library(tidyverse)
#> ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
#> ✔ ggplot2 3.3.5     ✔ purrr   0.3.4
#> ✔ tibble  3.1.6     ✔ dplyr   1.0.8
#> ✔ tidyr   1.2.0     ✔ stringr 1.4.0
#> ✔ readr   2.1.2     ✔ forcats 0.5.1
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ lubridate::as.difftime() masks base::as.difftime()
#> ✖ lubridate::date()        masks base::date()
#> ✖ dplyr::filter()          masks stats::filter()
#> ✖ lubridate::intersect()   masks base::intersect()
#> ✖ dplyr::lag()             masks stats::lag()
#> ✖ lubridate::setdiff()     masks base::setdiff()
#> ✖ lubridate::union()       masks base::union()
library(readr)
library(ggplot2)

1.2 Data imports

bikes_tbl = read_excel("ds_data/ds_data/01_bike_sales/01_raw_data/bikes.xlsx")
orderlines_tbl = read_excel("ds_data/ds_data/01_bike_sales/01_raw_data/orderlines.xlsx")
#> New names:
#> • `` -> `...1`
bikeshops_tbl = read_excel("ds_data/ds_data/01_bike_sales/01_raw_data/bikeshops.xlsx")

1.3 Examine

head(bikes_tbl, n=5)
head(orderlines_tbl, n=5)
head(bikeshops_tbl, n=5)

1.4 Data Manipulation and cleaning

  • product.id from orderlines.xlsx are the same as bike.id from bikes.xlsx
  • Therefore they are combined (both dataset share product.id)
  • Combining customer.id from orderlines.xlsx and bikeshop.id from bikeshops.xlsx
  • Because the bikeshops (middle man) are buying and selling to customers, therefore they are the customer of the supplier (Seller)
dplyr::left_join(orderlines_tbl, bikes_tbl, by = c("product.id" = "bike.id"))
bike_orderlines_joined_tbl = orderlines_tbl %>%
    dplyr::left_join(bikes_tbl, by = c("product.id" = "bike.id")) %>% 
    dplyr::left_join(bikeshops_tbl, by = c("customer.id" = "bikeshop.id"))

head(bike_orderlines_joined_tbl, n = 5)
bike_orderlines_wrangled_tbl = bike_orderlines_joined_tbl %>%
    separate(col = category,
            into = c("category.1", "category.2", "category.3"),
            sep = " - ") %>%
    mutate(total.price = price * quantity) %>%
    dplyr::select(-...1, -gender) %>%
    dplyr::select(-ends_with(".id")) %>%
    bind_cols(bike_orderlines_joined_tbl %>% dplyr::select(order.id)) %>%
    dplyr::select(order.id, contains("order"), contains("model"), contains("category"),
          price, quantity, total.price, everything()) %>%
    rename(bikeshop = name) %>%
    set_names(names(.) %>% str_replace_all("\\.", "_"))

2 Challenge

2.1 Analyse sales by state with bar plot

Split column of location

With eval: true

location_splitted = separate(
    bike_orderlines_wrangled_tbl,
    col = location,
    into = c("city", "state"),
    sep = ",")

location_splitted

Calculate sales, group by state

sales_by_state_tbl = location_splitted %>%
    dplyr::select(state, total_price) %>%
    mutate(state_1 = state) %>%
    group_by(state_1) %>% 
    summarize(sales = sum(total_price)) %>%
    mutate(sales_text = scales::dollar(sales,
                                      big.mark = ".",
                                      decimal.mark = ",",
                                      prefix = "",
                                      suffix = "€"))

sales_by_state_tbl

Plot bar

sales_by_state_tbl %>%
    ggplot(aes(x = state_1, y = sales)) +
    geom_col(fill = "#2DC6D6") + 
    geom_label(aes(label = sales_text)) +
    scale_y_continuous(labels = scales::dollar_format(big.mark = ".",
                                                     decimal.mark = ",",
                                                     prefix = "",
                                                     suffix = "€")) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +

    labs(
        title = "Revenue by State",
        subtitle = "",
        x = "state",
        y = "Revenue"
    )

2.2 Analyse sales by state and year with bar plot

calculate sales and group by state and year

sales_by_state_year_tbl = location_splitted %>%
    dplyr::select(order_date, state, total_price) %>%
    mutate(year_1 = year(order_date)) %>%
    group_by(state, year_1) %>% 
    summarize(sales = sum(total_price)) %>%
    mutate(sales_text = scales::dollar(sales,
                                      big.mark = ".",
                                      decimal.mark = ",",
                                      prefix = "",
                                      suffix = "€"))
#> `summarise()` has grouped output by 'state'. You can override using the
#> `.groups` argument.
sales_by_state_year_tbl

Plot bar of all each states

sales_by_state_year_tbl %>% 
    ggplot(aes(x = year_1, y = sales, fill = state)) +
    geom_col() +
    facet_wrap(~ state) +
    scale_y_continuous(labels = scales::dollar_format(big.mark = ".",
                                                     decimal.mark = ",",
                                                     prefix = "",
                                                     suffix = "€")) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +

    labs(
        title = "Revenue by year and state",
        subtitle = "",
        fill = "States"
    )